/*
Author: David Sturrock
Created: September 2019
Description: creates a definitive record of death status and date of death using
admin data and the household grid.

We know that 4,549 individuals who were in ELSA have died (as of April 2018)
Of these, only 33 do not have date of death info from either the admin or hhgrid.
*/

********************************************************************************
* Preamble
********************************************************************************

* Reset the session: drop any data, programs, and stored results left in memory
clear all
* Do not pause output waiting for a keypress, so the do-file runs unattended
set more off
* Raise the limit on the number of variables allowed in a dataset
set maxvar 10000

********************************************************************************
* ELSA death data: NHS records
// get the year and month of death where got this through admin link
********************************************************************************	

	* Load the linkage flags and the admin month/year of death
	clear
	use idauniq linkage_cm linkage2017 linkage2013 mdeath ydeath using "$mortality\Cancer_Mort_Index_v1.dta"

	* 1 when linkage_cm equals 1, 0 otherwise (including a missing linkage_cm)
	gen know_status_admin = linkage_cm == 1

	* Year and month of death are carried over only where the admin flag is set;
	* everyone else is left missing at this stage
	gen dodyr = cond(know_status_admin, ydeath, .)
	gen dodmnth = cond(know_status_admin, mdeath, .)

	* Park the admin extract until the household-grid record has been built
	tempfile AdminSurv
	save `AdminSurv'

********************************************************************************
* ELSA household grid
// get the year and month of death when reported for attriters
// this will be used only to fill in for those who declined admin link
// Note that out of 1026 obs where we know that person died, 571 have no date of 
// death info in HH grid (but could still have info from admin records).
********************************************************************************	


	* Wave 1 household grid: one row per idauniq with status flags, interview
	* month/year and any reported month/year of death. Saved as HHgridSurvW1.
	forv w = 1(1)1 {
		clear 
		use idauniq indoc dhwhat dhdead dhmon idahhw1 using "$hhgrid\hhgridw`w'.dta"
		drop if idauniq == . //not sure why there are any missing idauniq. are these people who never responded?
		
		* merge in interview dates as these are not in the wave 1 household grid *
		merge 1:1 idauniq using "M:\ELSA\data\InterviewDate\ELSAInterviewDateAllWaves.dta", keepusing(intdthw1)
		* use the earliest interview date recorded within the household for every member *
		bys idahhw1 : egen intdat = min(intdthw1)
		* every grid row must end up with a household interview date *
		assert intdat != . if (_m ==1 | _m ==3)
		* rows that exist only in the interview-date file are not wave 1 grid members *
		drop if _m == 2
		drop _m intdthw1

		* interview outcome codes for which the person's status is treated as known *
		gen know_status_grid = inlist(indoc,11,13,21,23,24,25,43,44,46,51,52,53,54,57,59,79,86,88,92,95)

		* in w1 dead and found to be ineligible is one category of interview outcome
		* code that dead if in this category and have given a year of death
		gen know_dead_grid = know_status_grid & indoc == 79 & dhdead >0 & dhdead !=.

		* FIX: previously only indoc == 79 with dhdead == -1 counted as alive, so
		* everyone whose status was known through any other outcome code was left
		* with know_alive_grid == 0. As in the later waves, anyone with known
		* status who is not flagged dead is now flagged alive.
		gen know_alive_grid = know_status_grid & !know_dead_grid

		gen int_year = year(intdat)
		gen int_month = month(intdat)

		* negative codes (-8 to -1) in the death year/month are set to missing *
		rename dhdead dodyr_grid
		recode dodyr_grid -8/-1 = .
		rename dhmon dodmnth_grid
		recode dodmnth_grid -8/-1 = .
		gen know_dod_grid = dodyr_grid !=. & dodmnth_grid != .
		gen know_yod_grid = dodyr_grid !=. 
		
		gen wave = `w'
		
		* if an idauniq has several rows keep the one that sorts last: known status
		* ahead of unknown, then the latest interview year and month *
		bys idauniq (know_status_grid int_year int_month) : keep if _n == _N

		keep idauniq know_alive_grid know_dead_grid know_status_grid know_dod_grid know_yod_grid int_year int_month dodyr_grid dodmnth_grid wave

		tempfile HHgridSurvW`w'
		save `HHgridSurvW`w''
	}

	* Wave 2 household grid: reduce to one row per idauniq holding status flags,
	* interview month/year and reported month/year of death. Saved as HHgridSurvW2.
	forvalues w = 2/2 {
		clear
		use idauniq w`w'indout dhwhat dhdead dhmon intdaty intdatm using "$hhgrid\hhgridw`w'.dta"

		* outcome codes for which the person's status is treated as known
		gen know_status_grid = inlist(w`w'indout,11,13,21,23,24,25,43,44,46,51,52,53,54,57,59,88,92,95)

		* dhwhat == 1 marks the person as dead (presumably "died" - confirm against the codebook)
		gen know_dead_grid = know_status_grid & dhwhat == 1
		gen know_alive_grid = know_status_grid & !know_dead_grid

		* interview date is already stored as separate year and month variables
		gen int_year = intdaty
		gen int_month = intdatm

		* reported date of death: negative codes (-8 to -1) become missing
		rename dhdead dodyr_grid
		rename dhmon dodmnth_grid
		replace dodyr_grid = . if inrange(dodyr_grid, -8, -1)
		replace dodmnth_grid = . if inrange(dodmnth_grid, -8, -1)
		gen know_yod_grid = dodyr_grid != .
		gen know_dod_grid = !(dodyr_grid == . | dodmnth_grid == .)

		gen wave = `w'

		* some people have multiple rows, e.g. reported absent from the household and
		* then found later. Keep the row that sorts last: known status ahead of
		* unknown, then the latest interview year and month.
		bysort idauniq (know_status_grid int_year int_month) : keep if _n == _N

		keep idauniq know_alive_grid know_dead_grid know_status_grid know_dod_grid know_yod_grid int_year int_month dodyr_grid dodmnth_grid wave

		tempfile HHgridSurvW`w'
		save `HHgridSurvW`w''
	}
		
		
	* Wave 3 household grid: no date-of-death variables are read for this wave, so
	* a rough date is imputed for those reported dead. Saved as HHgridSurvW3.
	forv w = 3(1)3 {
		clear 
		use idauniq w`w'indout dhwhat intdaty intdatm using "$hhgrid\hhgridw`w'.dta"
		drop if idauniq == .

		* outcome codes for which the person's status is treated as known *
		gen know_status_grid = inlist(w`w'indout,11,13,21,23,24,25,43,44,46,51,52,53,54,57,59,88,92,95)

		gen know_dead_grid = know_status_grid & dhwhat == 1
		gen know_alive_grid = know_status_grid & dhwhat != 1

		gen int_year = intdaty
		gen int_month = intdatm

		* impute date of death as June of the year before the interview.
		* FIX: this used to be filled in for every row, including people known to
		* be alive, which made know_dod_grid / know_yod_grid equal to 1 for anyone
		* with an interview year. It is now filled in only for those known dead.
		gen dodyr_grid = int_year - 1 if know_dead_grid == 1
		gen dodmnth_grid = 6 if know_dead_grid == 1
		gen know_dod_grid = dodyr_grid !=. & dodmnth_grid != .
		gen know_yod_grid = dodyr_grid !=. 
		
		gen wave = `w'
		
		* create flag for dodgy date of death: 1 for known dead, missing otherwise *
		gen imp_dod = 1 if know_dead_grid == 1
		
		* some have multiple obs. happens if e.g. reported not present in HH and then found later *
		* we take the latest interview where we know their status (or latest interview if don't ever know status) *
		bys idauniq (know_status_grid int_year int_month) : keep if _n == _N
		
		keep idauniq know_alive_grid know_dead_grid know_status_grid know_dod_grid know_yod_grid int_year int_month dodyr_grid dodmnth_grid wave imp_dod

		tempfile HHgridSurvW`w'
		save `HHgridSurvW`w''
	}
	

	* Wave 4 household grid: reduce to one row per idauniq holding status flags,
	* interview month/year and reported month/year of death. Saved as HHgridSurvW4.
	forvalues w = 4/4 {
		clear
		use idauniq w`w'indout dhwhat dhdead mdeath intdat using "$hhgrid\hhgridw`w'.dta"

		* outcome codes for which the person's status is treated as known
		gen know_status_grid = inlist(w`w'indout,11,13,21,23,24,25,43,44,46,51,52,53,54,57,59,88,92,95)

		* dhwhat == 1 marks the person as dead (presumably "died" - confirm against the codebook)
		gen know_dead_grid = know_status_grid & dhwhat == 1
		gen know_alive_grid = know_status_grid & !know_dead_grid

		* intdat is passed to year() and month(), so it is assumed to be a Stata daily date
		gen int_year = year(intdat)
		gen int_month = month(intdat)

		* reported date of death: negative codes (-8 to -1) become missing
		rename dhdead dodyr_grid
		rename mdeath dodmnth_grid
		replace dodyr_grid = . if inrange(dodyr_grid, -8, -1)
		replace dodmnth_grid = . if inrange(dodmnth_grid, -8, -1)
		gen know_yod_grid = dodyr_grid != .
		gen know_dod_grid = !(dodyr_grid == . | dodmnth_grid == .)

		gen wave = `w'

		* where an idauniq has several rows keep the one that sorts last: known
		* status ahead of unknown, then the latest interview year and month
		bysort idauniq (know_status_grid int_year int_month) : keep if _n == _N

		keep idauniq know_alive_grid know_dead_grid know_status_grid know_dod_grid know_yod_grid int_year int_month dodyr_grid dodmnth_grid wave

		tempfile HHgridSurvW`w'
		save `HHgridSurvW`w''
	}

	* Wave 5 household grid: reduce to one row per idauniq holding status flags,
	* interview month/year and reported month/year of death. Saved as HHgridSurvW5.
	forvalues w = 5/5 {
		clear
		use idauniq w`w'indout dhwhat mdeath dhdead iintdaty iintdatm using "$hhgrid\hhgridw`w'.dta"

		* outcome codes for which the person's status is treated as known
		gen know_status_grid = inlist(w`w'indout,11,13,21,23,24,25,43,44,46,51,52,53,54,57,59,88,92,95)

		* dhwhat == 1 marks the person as dead (presumably "died" - confirm against the codebook)
		gen know_dead_grid = know_status_grid & dhwhat == 1
		gen know_alive_grid = know_status_grid & !know_dead_grid

		* interview date is already stored as separate year and month variables
		gen int_year = iintdaty
		gen int_month = iintdatm

		* reported date of death: negative codes (-8 to -1) become missing
		rename dhdead dodyr_grid
		rename mdeath dodmnth_grid
		replace dodyr_grid = . if inrange(dodyr_grid, -8, -1)
		replace dodmnth_grid = . if inrange(dodmnth_grid, -8, -1)
		gen know_yod_grid = dodyr_grid != .
		gen know_dod_grid = !(dodyr_grid == . | dodmnth_grid == .)

		gen wave = `w'

		* where an idauniq has several rows keep the one that sorts last: known
		* status ahead of unknown, then the latest interview year and month
		bysort idauniq (know_status_grid int_year int_month) : keep if _n == _N

		keep idauniq know_alive_grid know_dead_grid know_status_grid know_dod_grid know_yod_grid int_year int_month dodyr_grid dodmnth_grid wave

		tempfile HHgridSurvW`w'
		save `HHgridSurvW`w''
	}

	* Waves 6 to 8 share the same grid layout, so they are processed in one loop.
	* Each wave is reduced to one row per idauniq and saved as HHgridSurvW6..W8.
	forvalues w = 6/8 {
		clear
		use idauniq w`w'indout dhwhat dhdead dhmond iintdat_ using "$hhgrid\hhgridw`w'.dta"

		* outcome codes for which the person's status is treated as known
		gen know_status_grid = inlist(w`w'indout,11,13,21,23,24,25,43,44,46,51,52,53,54,57,59,88,92,95)

		* dhwhat == 1 marks the person as dead (presumably "died" - confirm against the codebook)
		gen know_dead_grid = know_status_grid & dhwhat == 1
		gen know_alive_grid = know_status_grid & !know_dead_grid

		* iintdat_ is passed to year() and month(), so it is assumed to be a Stata daily date
		gen int_year = year(iintdat_)
		gen int_month = month(iintdat_)

		* reported date of death: negative codes (-8 to -1) become missing
		rename dhdead dodyr_grid
		rename dhmond dodmnth_grid
		replace dodyr_grid = . if inrange(dodyr_grid, -8, -1)
		replace dodmnth_grid = . if inrange(dodmnth_grid, -8, -1)
		gen know_yod_grid = dodyr_grid != .
		gen know_dod_grid = !(dodyr_grid == . | dodmnth_grid == .)

		gen wave = `w'

		* where an idauniq has several rows keep the one that sorts last: known
		* status ahead of unknown, then the latest interview year and month
		bysort idauniq (know_status_grid int_year int_month) : keep if _n == _N

		keep idauniq know_alive_grid know_dead_grid know_status_grid know_dod_grid know_yod_grid int_year int_month dodyr_grid dodmnth_grid wave

		tempfile HHgridSurvW`w'
		save `HHgridSurvW`w''
	}
		
		* Append all waves together *
		
		* Stack the per-wave grid files, starting from wave 1 *
		clear
		use `HHgridSurvW1'
		forvalues w = 2/8 {
			append using `HHgridSurvW`w''
		}

		* Only rows where the grid tells us the person's status are of use *
		keep if know_status_grid == 1

		* Within person, order by wave so the last row carries the latest wave, *
		* then keep only the row(s) from that wave *
		bysort idauniq (wave) : gen final_wave = wave[_N]
		gen last_ob = (wave == final_wave)
		keep if last_ob

		tempfile HHgridSurv
		save `HHgridSurv'
		
********************************************************************************
* Make definitive survival record
********************************************************************************

	* Combine the admin-linked record with the household-grid record into one row
	* per person, preferring admin information wherever the linkage flag is set,
	* and save the result as CombinedMortalityRecord.dta.
	clear
	use `AdminSurv'
	drop if idauniq ==908472    /* Natcen found an incorrect idauniq just before the data were archived. This person appears twice (second time with the idauniq 167760). So this duplicate is dropped */
	* NOTE(review): the id above is removed from the admin file only. If it also
	* appears in the household grid it re-enters through the merge below as a
	* grid-only row - confirm whether it should be dropped there as well.
	merge 1:1 idauniq using `HHgridSurv'
	* everyone in the admin file is expected to have a known-status grid record *
	assert _m != 1
	drop _m
	
	* FIX: grid-only rows have a missing know_status_admin after the merge. Stata
	* treats missing as true in "if know_status_admin" and as not equal to 0 in
	* "know_status_admin == 0", so those rows were given the April 2018 admin
	* date and their grid death information was ignored. All tests below use
	* this 0/1 helper instead, which is 1 only when the admin flag equals 1.
	gen byte admin_known = know_status_admin == 1
	
	* create var for whether know death status at all from either source *
	gen know_status = know_status_admin
	replace know_status = know_status_grid if !admin_known
	
	* all of the admin linked info tells us death status up to April 2018 *
	gen yr_know = 2018 if admin_known
	gen mth_know = 4 if admin_known
	
	* if know status from grid then know until month of relevant interview *
	replace yr_know = int_year if !admin_known & know_status_grid == 1
	replace mth_know = int_month if !admin_known & know_status_grid == 1
	
	* make dummy for if dead *
	* NOTE(review): the ydeath test is not restricted to admin-linked cases - confirm intended *
	gen dead = .
	replace dead = 1 if inrange(ydeath,2002,2018) | (!admin_known & know_dead_grid == 1)
	* explicit == 1 so that a missing know_status can never be read as true *
	replace dead = 0 if know_status == 1 & dead == .
	
	* year of death *
	replace dodyr = . if dodyr < 0 // this is the admin variable
	replace dodyr = dodyr_grid if dead == 1 & !admin_known & know_status_grid == 1
	
	* month of death *
	replace dodmnth = . if dodmnth < 0
	replace dodmnth = dodmnth_grid if dead == 1 & !admin_known & know_status_grid == 1

	* create dummy for missing date of death amongst those who we know dead: 33 cases *
	gen dod_miss = 0
	replace dod_miss = 1 if dead == 1 & dodyr == .

	* FIX: imp_dod is carried over from the wave 3 grid and was left at 1 even
	* when status and date come from the admin link, where nothing is imputed.
	* Clear the flag for admin-linked cases.
	replace imp_dod = . if admin_known

	keep idauniq know_status know_status_admin dead yr_know mth_know dodyr dodmnth dod_miss imp_dod
	
	* label vars *
	label variable know_status "Know whether alive or dead (as of latest date status known)"
	label variable know_status_admin "Death information comes from admin link"
	label variable yr_know "Latest year in which death status known"
	label variable mth_know "Latest month in which death status known"
	label variable dead "Whether dead (as of latest date status known)"
	label variable dodyr "Year died if known dead"
	label variable dodmnth "Month died if known dead"
	label variable dod_miss "Known to have died but no date of death"
	label variable imp_dod "Known to have died: date of death roughly imputed"
	
	save "M:\ELSA\data\mortality\CombinedMortalityRecord\CombinedMortalityRecord.dta", replace
